/***
This do-file produces a CPS by wage quartile benchmark figure.
***/

*-------------------------------------------------------------------------------
* Set up
*-------------------------------------------------------------------------------

* Set $root 
project figstabs, root
if (r(buildrunning) == 0) include "${root}/code/config_interactive.do"

* Set globals
project, uses("${root}/code/set_globals.do")
include "${root}/code/set_globals.do"
local category "Employment"

* Create required subfolders
cap mkdir "${root}/data/derived"
cap mkdir "${root}/data/derived/Employment"
cap mkdir "${root}/data/derived/CPS"
cap mkdir "${root}/results"
cap mkdir "${root}/results/Employment"
cap mkdir "${root}/results/paper numbers"
cap mkdir "${root}/results/paper numbers/`category'"

*-------------------------------------------------------------------------------
**# 1. Clean data
*-------------------------------------------------------------------------------

**## 1.1 State-level rent and population from ACS 
project, uses("${root}/data/derived/ACS 2014-2018 5-Year State/ACS 2014-2018 State.dta")
use "${root}/data/derived/ACS 2014-2018 5-Year State/ACS 2014-2018 State.dta", clear 

rename (state_fips pop_2014_2018 med_2br_rent_2018) (statefips pop_2018 median_2br_rent_2018)
keep median_2br_rent_2018 pop_2018 statefips
replace median_2br_rent_2018 = median_2br_rent_2018 / 1000                      // rescale var to be rent in thousands of dollars

tempfile acs 
save `acs'

**## 1.2. CPS

project, uses("${root}/data/dvc/CPS/cps_00037.dta")
use "${root}/data/dvc/CPS/cps_00037.dta", clear 

keep if age >= 16

* Create NAICS
gen naics = . 
replace naics = 11 if inrange(ind, 0170, 0290)
replace naics = 21 if inrange(ind, 0370, 0490)
replace naics = 23 if inrange(ind, 0770, 0770)
replace naics = 31 if inrange(ind, 1070, 1790)
replace naics = 32 if inrange(ind, 1870, 2590)
replace naics = 33 if inrange(ind, 2670, 3990)
replace naics = 42 if inrange(ind, 4070, 4590)
replace naics = 44 if inrange(ind, 4670, 5190)
replace naics = 45 if inrange(ind, 5275, 5790)
replace naics = 48 if inrange(ind, 6070, 6290)
replace naics = 49 if inrange(ind, 6370, 6390)
replace naics = 22 if inrange(ind, 0570, 0690)
replace naics = 51 if inrange(ind, 6470, 6780)
replace naics = 52 if inrange(ind, 6870, 6992)
replace naics = 53 if inrange(ind, 7070, 7190)
replace naics = 54 if inrange(ind, 7270, 7490)
replace naics = 55 if inrange(ind, 7570, 7570)
replace naics = 56 if inrange(ind, 7580, 7790)
replace naics = 61 if inrange(ind, 7860, 7890)
replace naics = 62 if inrange(ind, 7970, 8470)
replace naics = 71 if inrange(ind, 8560, 8590)
replace naics = 72 if inrange(ind, 8660, 8690)
replace naics = 81 if inrange(ind, 8770, 9290)
replace naics = 92 if inrange(ind, 9370, 9890)
						
* Be consistent with PIE and CES series
gen naics_code = ""
replace naics_code = "11" if naics == 11
replace naics_code = "21" if naics == 21
replace naics_code = "22" if naics == 22
replace naics_code = "23" if naics == 23
replace naics_code = "3133" if naics == 31 | naics == 32 | naics == 33
replace naics_code = "42" if naics == 42
replace naics_code = "4445" if naics == 44 | naics == 45
replace naics_code = "4849" if naics == 48 | naics == 49
replace naics_code = "51" if naics == 51
replace naics_code = "52" if naics == 52
replace naics_code = "53" if naics == 53
replace naics_code = "54" if naics == 54
replace naics_code = "55" if naics == 55
replace naics_code = "56" if naics == 56
replace naics_code = "61" if naics == 61
replace naics_code = "62" if naics == 62
replace naics_code = "71" if naics == 71
replace naics_code = "72" if naics == 72
replace naics_code = "81" if naics == 81
				
* Drop some sectors according to BLS adjustment 
drop if naics == 92 															// drop those working in public sector to match CES (Total Private Employment)
drop if naics == 11 															// drop those working in agriculture, forestry, fishing, and hunting according to BLS adjustment of CPS to CES
drop if naics == 9290 															// drop workers in private households such as nannies, housekeepers, etc.
			
* Drop some classes of workers according to BLS adjustment
drop if inlist(classwkr, 0, 13, 25, 26, 27, 28, 29) 							// drop missing (0), unincorporated, self-employed (13), and all public sector employees (25-29) 
			
* Convert to super sector
gen naics_ss = ""
replace naics_ss = "10" if inlist(naics_code, "11", "21")
replace naics_ss = "20" if inlist(naics_code, "23")
replace naics_ss = "30" if inlist(naics_code, "31-33")
replace naics_ss = "40" if inlist(naics_code, "42", "44-45", "48-49", "22")
replace naics_ss = "50" if inlist(naics_code, "51")
replace naics_ss = "55" if inlist(naics_code, "52", "53")
replace naics_ss = "60" if inlist(naics_code, "54", "55", "56")
replace naics_ss = "65" if inlist(naics_code, "61", "62")
replace naics_ss = "70" if inlist(naics_code, "71", "72")
replace naics_ss = "80" if inlist(naics_code, "81")
			
* Reformat NAICS codes  
replace naics_code = subinstr(naics_code, "-", "_", .) 
			
* Define hourly wages
cap drop wage
replace earnweek = . if earnweek > 9999 
replace uhrswork1 = . if uhrswork1 > 996
replace hourwage = . if hourwage > 999
replace hourwage = earnweek / uhrswork1 if mi(hourwage) & paidhour == 2  		// if paid hourly, divide weekly earnings by amount of hours usually worked
gen wage = hourwage if paidhour == 2
replace wage = earnweek / uhrswork1 if paidhour == 1

replace wage = 100 if wage > 100 & !mi(wage)
replace wage = 5 if wage < 5

* Limit end of period to Dec 2021 
gen date = mdy(month, 15, year)
format date %td
drop if date >= mdy(1,1,2022)

* Indicator for whether person is in initial survey period
assert !missing(mish)
gen initial_survey_period = inrange(mish, 1, 4) 

tempfile cps_cleaned 
save `cps_cleaned'

*-------------------------------------------------------------------------------
**# 2. CPS, panel
*-------------------------------------------------------------------------------

*-------------------------------------------------------------------------------
**## 2.1. Cleaning 
*-------------------------------------------------------------------------------

use `cps_cleaned', clear

* Keep those with jobs in initial period
cap drop temp
gen temp = inlist(empstat, 10) & initial_survey_period == 1
gegen initial_employed = max(temp), by(cpsidp)
keep if initial_employed == 1
drop temp initial_employed

* Keep those with follow-up survey in July 2020-February 2021
cap drop temp 
gen temp = (initial_survey_period == 0 & inrange(date, mdy(7, 1, 2020), mdy(2, 28, 2021)) & earnwt != 0)
gegen followup_oct_dec_2020 = max(temp), by(cpsidp)
keep if followup_oct_dec_2020 == 1
drop temp followup_oct_dec_2020

* Drop those with initial survey in March 2020 or later
cap drop temp 
gen temp = (initial_survey_period == 1 & date >= mdy(3, 1, 2020) & earnwt != 0)
gegen initial_mar_2020 = max(temp), by(cpsidp)
drop if initial_mar_2020 == 1
drop temp initial_mar_2020

* Keep those who are in sample in both periods 
cap drop temp 
gen temp = initial_survey_period == 1 & earnwt != 0
gegen in_initial_survey = max(temp), by(cpsidp)
drop temp 
gen temp = initial_survey_period == 0 & earnwt != 0
gegen in_followup_survey = max(temp), by(cpsidp)
keep if in_initial_survey == 1 & in_followup_survey == 1
drop temp in_initial_survey in_followup_survey

* Indicator for being employed in follow-up period 
cap drop temp 
gen temp = inlist(empstat, 10) & initial_survey_period == 0
gegen follow_up_employed = max(temp), by(cpsidp)
drop temp 

* Examine effective initial survey periods and follow-up survey periods 
tab date if initial_survey_period == 1 & earnwt != 0
assert inrange(date, mdy(7,1,2019), mdy(2,29,2020)) if initial_survey_period == 1 & earnwt != 0
tab date if initial_survey_period == 0 & earnwt != 0
assert inrange(date, mdy(7,1,2020), mdy(2,28,2021)) if initial_survey_period == 0 & earnwt != 0

*-------------------------------------------------------------------------------
**## 2.2. Adjust for integer wage bunching using random perturbation
*-------------------------------------------------------------------------------

* Add random noise to integer wages to avoid issues associated with wage bunching, limiting to integer wages we adjust for in Paychex for comparability
set seed 1280
gen random_noise = runiform(-0.5, 0.5)
replace wage = wage + random_noise if inlist(wage, 13, 14, 15, 16, 19, 20, 21, 22, 23)

* Merge poverty thresholds 
project, uses("${root}/data/dvc/Employment/poverty_thresholds.dta")
merge m:1 date using "${root}/data/dvc/Employment/poverty_thresholds.dta", assert(2 3) keep(3) nogen

* Gen quartiles
gen quartile = 1 if wage <= poverty_100 & !mi(wage)
replace quartile = 2 if wage > poverty_100 & wage <= poverty_150 & !mi(wage)
replace quartile = 3 if wage > poverty_150 & wage <= poverty_250 & !mi(wage)
replace quartile = 4 if wage > poverty_250 & !mi(wage)

* Quartile of each person when they were first in sample 
cap drop temp
gegen temp = mean(quartile) if initial_survey_period == 1, by(cpsidp)
gegen initial_quartile = mean(temp), by(cpsidp)                  				// just fills in missing values
drop temp

* Keep if in followup period to avoid double-counting
gen in_sample = (earnwt != 0)
gegen times_in_sample = sum(in_sample), by(cpsidp)
assert times_in_sample == 2
keep if in_sample == 1 & initial_survey_period == 0

* Collapse by state and initial quartile split
drop if missing(initial_quartile)
gcollapse follow_up_employed [pweight = earnwt], by(statefip initial_quartile)

* Merge rent and population
rename statefip statefips                                              
merge m:1 statefips using `acs', keep(match) nogen

* Express probability of employment in percentage terms 
replace follow_up_employed = follow_up_employed * 100

* Rent gradients 
foreach split in 1 2 3 4 {
	reg follow_up_employed median_2br_rent_2018 [w = pop_2018] if initial_quartile == `split', r
	reg follow_up_employed median_2br_rent_2018 [w = pop_2018] if initial_quartile == `split' & !inlist(statefips, 6, 25, 36), r
	local cps_panel_beta_`split' = _b[median_2br_rent_2018]
	local cps_panel_ll_`split' = r(table)[5,1]
	local cps_panel_ul_`split' = r(table)[6,1]
}

* Summary stats 
forval q = 1/4 {
	sum follow_up_employed [w = pop_2018] if initial_quartile == `q' & !inlist(statefips, 6, 25, 36)
	
	if `q' > 1 {
		gen q1_vs_q`q' = (initial_quartile == 1) if inlist(initial_quartile, 1, `q')
		reg follow_up_employed q1_vs_q`q' [w = pop_2018] if !inlist(statefips, 6, 25, 36), r
		local q1_vs_q`q'_diff: di %3.2f abs(_b[q1_vs_q`q'])
	}
}

* Scatterplots
statastates, fips(statefips)
replace median_2br_rent_2018 = median_2br_rent_2018 * 1000
replace follow_up_employed = follow_up_employed - 100

scatter follow_up_employed median_2br_rent_2018 [w = pop_2018] if initial_quartile == 1 & !inlist(statefips, 6, 25, 36), mlabel(state_abbrev) msize(tiny) ///
title("CPS Panel, Q1", size(medsmall)) ytitle("Change in Employment (%)" "From Jul 2019-Feb 2020 to Jul 2020-Feb 2021") xtitle("Median Two-Bedroom Rent in 2014-2018 ($)")

scatter follow_up_employed median_2br_rent_2018 [w = pop_2018] if initial_quartile == 2 & !inlist(statefips, 6, 25, 36), mlabel(state_abbrev) msize(tiny) ///
title("CPS Panel, Q2", size(medsmall)) ytitle("Change in Employment (%)" "From Jul 2019-Feb 2020 to Jul 2020-Feb 2021") xtitle("Median Two-Bedroom Rent in 2014-2018 ($)")

scatter follow_up_employed median_2br_rent_2018 [w = pop_2018] if initial_quartile == 3 & !inlist(statefips, 6, 25, 36), mlabel(state_abbrev) msize(tiny) ///
title("CPS Panel, Q3", size(medsmall)) ytitle("Change in Employment (%)" "From Jul 2019-Feb 2020 to Jul 2020-Feb 2021") xtitle("Median Two-Bedroom Rent in 2014-2018 ($)")

scatter follow_up_employed median_2br_rent_2018 [w = pop_2018] if initial_quartile == 4 & !inlist(statefips, 6, 25, 36), mlabel(state_abbrev) msize(tiny) ///
title("CPS Panel, Q4", size(medsmall)) ytitle("Change in Employment (%)" "From Jul 2019-Feb 2020 to Jul 2020-Feb 2021") xtitle("Median Two-Bedroom Rent in 2014-2018 ($)")

*-------------------------------------------------------------------------------
**# 3. Paychex 
*-------------------------------------------------------------------------------

*-------------------------------------------------------------------------------
**## 3.1. State-level 
*-------------------------------------------------------------------------------
project, uses("${root}/data/web/data/Employment - State - Weekly.csv")
import delimited "${root}/data/web/data/Employment - State - Weekly.csv", clear
keep if (year == 2020 & inrange(month, 7, 12)) | (year == 2021 & inrange(month, 1, 2))

gcollapse emp_incq*, by(statefips)                        // take mean over dates within state, so no pop weights needed   

merge m:1 statefips using `acs', assert(2 3) keep(3) nogen 

foreach split in 1 2 3 4 {
	replace emp_incq`split' = emp_incq`split' * 100
	
	reg emp_incq`split' median_2br_rent_2018 [w = pop_2018], r
	
	reg emp_incq`split' median_2br_rent_2018 [w = pop_2018] if !inlist(statefips, 6, 25, 36), r
	local paychex_beta_`split' = _b[median_2br_rent_2018]
	local paychex_ll_`split' = r(table)[5,1]
	local paychex_ul_`split' = r(table)[6,1]
}

* Scatterplots
statastates, fips(statefips)
replace median_2br_rent_2018 = median_2br_rent_2018 * 1000

scatter emp_incq1 median_2br_rent_2018 [w = pop_2018] if !inlist(statefips, 6, 25, 36), mlabel(state_abbrev) msize(tiny) ///
title("Paychex-Intuit, Q1", size(medsmall)) ytitle("Change in Employment (%)" "From Jul 2019-Feb 2020 to Jul 2020-Feb 2021") xtitle("Median Two-Bedroom Rent in 2014-2018 ($)")

scatter emp_incq2 median_2br_rent_2018 [w = pop_2018] if !inlist(statefips, 6, 25, 36), mlabel(state_abbrev) msize(tiny) ///
title("Paychex-Intuit, Q2", size(medsmall)) ytitle("Change in Employment (%)" "From Jul 2019-Feb 2020 to Jul 2020-Feb 2021") xtitle("Median Two-Bedroom Rent in 2014-2018 ($)")

scatter emp_incq3 median_2br_rent_2018 [w = pop_2018] if !inlist(statefips, 6, 25, 36), mlabel(state_abbrev) msize(tiny) ///
title("Paychex-Intuit, Q3", size(medsmall)) ytitle("Change in Employment (%)" "From Jul 2019-Feb 2020 to Jul 2020-Feb 2021") xtitle("Median Two-Bedroom Rent in 2014-2018 ($)")

scatter emp_incq4 median_2br_rent_2018 [w = pop_2018] if !inlist(statefips, 6, 25, 36), mlabel(state_abbrev) msize(tiny) ///
title("Paychex-Intuit, Q4", size(medsmall)) ytitle("Change in Employment (%)" "From Jul 2019-Feb 2020 to Jul 2020-Feb 2021") xtitle("Median Two-Bedroom Rent in 2014-2018 ($)")

*-------------------------------------------------------------------------------
**# 4. Plot 
*-------------------------------------------------------------------------------

* Specifications: (1) Paychex, state (2) CPS, panel with Paychex adjustment
clear 
set obs 11 
gen spec = ""
gen split = ""
gen beta = .
gen lower_ci = .
gen upper_ci = .

local i = 0

foreach split in 1 2 3 4 {
	
	if `split' != 1 local i = `i' + 1                                   		// get some space on the graph
	
	foreach spec in "paychex" "cps_panel" {
		local i = `i' + 1
		replace spec = "`spec'" in `i'
		replace split = "`split'" in `i'
		replace beta = ``spec'_beta_`split'' in `i'
		replace lower_ci = ``spec'_ll_`split'' in `i'
		replace upper_ci = ``spec'_ul_`split'' in `i'
	}
}

* Variable to order estimates on graph
gen var_order = _n
		
* CI plot 
keep if spec == "cps_panel"
destring split, replace
replace var_order = 0 + split * 0.1

twoway (rcap upper_ci lower_ci var_order, color(oi1) vertical msize(medsmall) lwidth(medthin)) ///
	   (scatter beta var_order, mcolor(oi1) msize(medium)) ///
		, ///
		ytitle("Change in Employment" "vs. Rent Slope (%/$1000)") ///
		ylabel(-15 "-15%" -10 "-10%" -5 "-5%" 0 "0%", nogrid labsize(small)) ///
		legend(off) ///
		xtitle("") xsc(ra(0.05 0.45)) ///
		xlabel(0.1 "Bottom Quartile" 0.2 "Second Quartile" 0.3 "Third Quartile" 0.4 "Top Quartile", nogrid notick labsize(medsmall))
				  
oi_graph_export "${root}/results/Employment/CPS Panel Benchmark - Rent Gradient by Income Quartile", type(${fig_type})

*-------------------------------------------------------------------------------
**# 5. Export numbers to CSV file 
*-------------------------------------------------------------------------------

di `q1_vs_q4_diff'

cap erase "${root}/results/paper numbers/`category'/CPS Panel Employment Change Q1 vs Q4.yaml"

yamlout using "${root}/results/paper numbers/`category'/CPS Panel Employment Change Q1 vs Q4.yaml", ///
	key("q1_vs_q4_diff") ///
	comment("Difference in prob. of employment in follow-up survey between Q1 and Q4") ///
	value(`q1_vs_q4_diff') fmt(%2.1f)

project, creates("${root}/results/paper numbers/`category'/CPS Panel Employment Change Q1 vs Q4.yaml")
